Title level 1

This is the second day of Software Carpentry! I hope everyone is enjoying using git.

Title level 2

Here is some text in the second level of the document.

Here is a line that will show up on the web.

bold

italics

Load packages

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
source("functions.R")

I downloaded the file and loaded it into R

# Fetch the gapminder CSV. download.file() cannot create the destination
# directory itself, so make sure "data/" exists first, and skip the network
# round-trip when a local copy is already on disk.
gapminder_url <- "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/_episodes_rmd/data/gapminder-FiveYearData.csv"
gapminder_csv <- "data/gapminder-FiveYearData.csv"

dir.create(dirname(gapminder_csv), showWarnings = FALSE, recursive = TRUE)
if (!file.exists(gapminder_csv)) {
  download.file(gapminder_url, destfile = gapminder_csv)
}

gapminder <- read.csv(gapminder_csv)

# Peek at the first rows to confirm the import worked.
head(gapminder)
##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134

I wonder if rstats increases life expectancy over the years

# Scatter plot of life expectancy against year, one point per row of gapminder.
p <- ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp)) +
  geom_point()

# Printing the object renders the plot.
p

Let’s see the interactive version

# Hand the ggplot object `p` to ggplotly() to get the interactive version.
ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

Making your own functions

If you are repeating yourself in your code, you may be able to solve that problem by making your own function!

# Small example vector to try the helper on.
# NOTE(review): `cars` is also the name of a built-in R dataset; this
# assignment masks it in the workspace.
cars <- c(3, 4, 5, 6, 7, 10)

# se() is not defined in this document; presumably it comes from the
# functions.R file sourced earlier -- confirm.
se(cars)
## [1] 1.013794

Data manipulation with dplyr

You will likely want to get subsections of your dataframe and/or calculate means of a variable for a certain subsection. dplyr is your friend!

Explore select

# Re-read the CSV so this section starts from a fresh copy of the data.
gapminder <- read.csv("data/gapminder-FiveYearData.csv")

# Two ways to end up with year, country and gdpPercap:
# name the columns to keep ...
year_country_gdp <- select(gapminder, year, country, gdpPercap)
# ... or drop the others. This second assignment replaces the first and
# leaves the remaining columns in their original order.
year_country_gdp <- select(gapminder, -pop, -continent, -lifeExp)
names(year_country_gdp)
## [1] "country"   "year"      "gdpPercap"

Explore filter

# Step by step: keep the European rows, then pick three columns.
euro <- filter(gapminder, continent == "Europe")
year_country_gdp_euro <- select(euro, year, country, gdpPercap)

# The same result as a single pipeline, without needing the intermediate
# `euro` object.
year_country_gdp_euro <- gapminder %>%
  filter(continent == "Europe") %>%
  select(year, country, gdpPercap)

Exploring the amazing group_by and summarize functions

# One row per country: the mean of gdpPercap and se() of gdpPercap.
mean_gdp_percountry <- gapminder %>%
  group_by(country) %>%
  summarise(
    mean_gdp = mean(gdpPercap),
    se_gdp = se(gdpPercap)
  )

mean_gdp_percountry
## # A tibble: 142 x 3
##        country   mean_gdp     se_gdp
##         <fctr>      <dbl>      <dbl>
##  1 Afghanistan   802.6746   31.23550
##  2     Albania  3255.3666  344.20223
##  3     Algeria  4426.0260  378.26190
##  4      Angola  3607.1005  336.56641
##  5   Argentina  8955.5538  537.68144
##  6   Australia 19980.5956 2256.11315
##  7     Austria 20411.9163 2787.23968
##  8     Bahrain 18077.6639 1563.29518
##  9  Bangladesh   817.5588   67.86165
## 10     Belgium 19900.7581 2422.32683
## # ... with 132 more rows

Challenge: I want the mean, se, and sample size of life expectancy by continent

# Summarise life expectancy per group: mean, se() and number of rows.
# NOTE(review): the challenge asks for one row per continent, but grouping by
# continent AND country yields one row per country (142 rows in the printed
# result). group_by(continent) alone would answer the question as stated.
# Confirm the intent and re-knit before changing it, because the recorded
# output depends on this grouping.
mean_se_life_percontinent<-gapminder %>% 
  group_by(continent,country) %>% 
  summarise(mean_life=mean(lifeExp),
            se_life=se(lifeExp),
            samsize_life=n())

mean_se_life_percontinent
## # A tibble: 142 x 5
## # Groups:   continent [?]
##    continent                  country mean_life   se_life samsize_life
##       <fctr>                   <fctr>     <dbl>     <dbl>        <int>
##  1    Africa                  Algeria  59.03017 2.9849208           12
##  2    Africa                   Angola  37.88350 1.1562236           12
##  3    Africa                    Benin  48.77992 1.7691977           12
##  4    Africa                 Botswana  54.59750 1.7116922           12
##  5    Africa             Burkina Faso  44.69400 1.9762099           12
##  6    Africa                  Burundi  44.81733 0.9165096           12
##  7    Africa                 Cameroon  48.12850 1.5784640           12
##  8    Africa Central African Republic  43.86692 1.3627459           12
##  9    Africa                     Chad  46.77358 1.4110376           12
## 10    Africa                  Comoros  52.38175 2.3476081           12
## # ... with 132 more rows

Combining ggplot and dplyr

# Life expectancy over time for the European rows of gapminder, coloured by
# country and drawn as one facet per country.
euro_countries <- gapminder %>%
  filter(continent == "Europe") %>%
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  facet_wrap(~country)

euro_countries

# Pass the plot explicitly instead of relying on ggsave()'s default of
# "whatever was displayed last", so the saved image cannot silently change if
# another plot is drawn in between.
ggsave("euro.png", plot = euro_countries)
## Saving 7 x 5 in image
# write.csv() does not create missing directories, so make sure "processed/"
# exists before writing (no warning if it is already there).
dir.create("processed", showWarnings = FALSE)
write.csv(mean_gdp_percountry,"processed/mean_gdp_percountry.csv")

Data manipulation with tidyr

# Download the 'wide' data, skipping the download when a local copy exists.
# download.file() needs the destination directory to exist, so create it first.
wide_csv <- "data/gapminder_wide.csv"
dir.create(dirname(wide_csv), showWarnings = FALSE, recursive = TRUE)
if (!file.exists(wide_csv)) {
  download.file("https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/data/gapminder_wide.csv", destfile = wide_csv)
}

gapminder_wide <- read.csv(wide_csv)

# Reshape wide -> long: every column except the two identifier columns is
# stacked into a key column (obstype_year) and a value column (obs_values).
# Excluding the identifiers by name, rather than selecting positions 3:38,
# keeps this correct if the number or order of measurement columns changes.
gap_long <- gapminder_wide %>%
  gather(obstype_year, obs_values, -continent, -country)

head(gap_long)
##   continent      country   obstype_year obs_values
## 1    Africa      Algeria gdpPercap_1952  2449.0082
## 2    Africa       Angola gdpPercap_1952  3520.6103
## 3    Africa        Benin gdpPercap_1952  1062.7522
## 4    Africa     Botswana gdpPercap_1952   851.2411
## 5    Africa Burkina Faso gdpPercap_1952   543.2552
## 6    Africa      Burundi gdpPercap_1952   339.2965

Separate the obstype_year column into obs_type and year

# Split keys such as "gdpPercap_1952" at the underscore into obs_type and
# year, then spread obs_type back out so each measurement gets its own column.
gap_normal <- gap_long %>%
  separate(obstype_year, into = c("obs_type", "year"), sep = "_") %>%
  spread(obs_type, obs_values)

head(gap_normal)
##   continent country year gdpPercap lifeExp      pop
## 1    Africa Algeria 1952  2449.008  43.077  9279525
## 2    Africa Algeria 1957  3013.976  45.685 10270856
## 3    Africa Algeria 1962  2550.817  48.303 11000948
## 4    Africa Algeria 1967  3246.992  51.407 12760499
## 5    Africa Algeria 1972  4182.664  54.518 14760787
## 6    Africa Algeria 1977  4910.417  58.014 17152804
# Compare the reshaped data with the original. all.equal() reports the
# differences it finds instead of returning TRUE when the objects differ.
# NOTE(review): the two head() printouts show the columns in a different
# order in each data frame, which likely explains the name mismatches.
all.equal(gapminder,gap_normal)
##  [1] "Names: 5 string mismatches"                                                                            
##  [2] "Component 1: Attributes: < Component \"levels\": Lengths (142, 5) differ (string compare on first 5) >"
##  [3] "Component 1: Attributes: < Component \"levels\": 5 string mismatches >"                                
##  [4] "Component 1: 1704 string mismatches"                                                                   
##  [5] "Component 2: Attributes: < target is NULL, current is list >"                                          
##  [6] "Component 2: target is numeric, current is factor"                                                     
##  [7] "Component 3: Modes: numeric, character"                                                                
##  [8] "Component 3: target is numeric, current is character"                                                  
##  [9] "Component 4: 'current' is not a factor"                                                                
## [10] "Component \"lifeExp\": Mean relative difference: 0.203822"                                             
## [11] "Component 6: Mean relative difference: 4101.546"
# Sort the rows by country, then continent, then year, and compare again.
gap_normal <- gap_normal %>%
  arrange(country, continent, year)
all.equal(gapminder, gap_normal)
##  [1] "Names: 5 string mismatches"                                                                            
##  [2] "Component 1: Attributes: < Component \"levels\": Lengths (142, 5) differ (string compare on first 5) >"
##  [3] "Component 1: Attributes: < Component \"levels\": 5 string mismatches >"                                
##  [4] "Component 1: 1704 string mismatches"                                                                   
##  [5] "Component 2: Attributes: < target is NULL, current is list >"                                          
##  [6] "Component 2: target is numeric, current is factor"                                                     
##  [7] "Component 3: Modes: numeric, character"                                                                
##  [8] "Component 3: target is numeric, current is character"                                                  
##  [9] "Component 4: 'current' is not a factor"                                                                
## [10] "Component 6: Mean relative difference: 4101.546"